### processing combined BES data 

# load helper functions for the pivotality calculations (a local script, not a package)
source("utils_pivotality_functions.R")

## Read the combined BES file written by wvms_bes_data_prep_all.R and put it in
## year / constituency / respondent order.
D <- read.csv("BES_2005_2010_2015_combined.csv")
D <- D[order(D$year, D$refno, D$id), ]

parties <- c("lab", "con", "ld", "ukip", "grn", "snp", "pc")

## Ratings for parties that won't be running where the respondent lives are set
## to NA: SNP outside Scotland, Plaid Cymru outside Wales.
## `ratings` holds one column per party; `suffix` is the shared column-name ending.
blank_regional_parties <- function(ratings, suffix) {
  ratings[D$country != "Scotland", paste0("snp", suffix)] <- NA
  ratings[D$country != "Wales", paste0("pc", suffix)] <- NA
  ratings
}

# party like-dislike, pre-election wave
U_party_feelpre <- blank_regional_parties(D[, paste0(parties, "feel")], "feel")

# party like-dislike, post-election wave
U_party_feelpost <- blank_regional_parties(D[, paste0(parties, "feelpost")], "feelpost")

## feelmix = post-election ratings, except UKIP and Green which come from the
## pre-election survey, because in 2005 and 2010 that is the only time people
## are asked about those two parties. Column names keep the "feelpost" ending.
U_party_feelmix <- U_party_feelpost
U_party_feelmix$ukipfeelpost <- D$ukipfeel
U_party_feelmix$grnfeelpost <- D$grnfeel

# leader like-dislike, post-election wave
U_leader_feelpost <- blank_regional_parties(D[, paste0(parties, "leaderfeelpost")], "leaderfeelpost")

# U is the scratch name used further down; leave it holding the last matrix built
U <- U_leader_feelpost

## Party identification. The raw labels are harmonised first, then turned into
## a respondent-by-party 0/1 matrix.
source("utils_party_regularizing.r")
for (id_var in c("partyID", "partyIDpost", "partyIDsqueeze", "partyIDsqueezepost")) {
  D[[paste0(id_var, "_reg")]] <- regularize_party_vec(D[[id_var]])
}

# Single variable including the squeeze answer: respondents who answer
# "don't know" / "none" to the main question get their squeeze answer instead.
no_main_id <- D$partyID_reg %in% c("don't know", "none")
D$partyID_reg_incl_squeeze <- D$partyID_reg
D$partyID_reg_incl_squeeze[no_main_id] <- D$partyIDsqueeze_reg[no_main_id]

no_main_id_post <- D$partyIDpost_reg %in% c("don't know", "none")
D$partyIDpost_reg_incl_squeeze <- D$partyIDpost_reg
D$partyIDpost_reg_incl_squeeze[no_main_id_post] <- D$partyIDsqueezepost_reg[no_main_id_post]

# One column per party: 1 if the post-election party ID (incl. squeeze) is that
# party, 0 if it is something else, NA if the party ID itself is NA.
party_id_mat <- matrix(
  as.integer(outer(D$partyIDpost_reg_incl_squeeze, parties, "==")),
  nrow = nrow(D),
  ncol = length(parties),
  dimnames = list(NULL, parties)
)

## Sincere-vote indicator matrix. The tie-breaker matrix (party_id_mat in this script) is
## added to U first; the result marks which column(s) hold the row maximum.
## Rows with more than one 1 are ties the tie-breaker could not resolve; deciding what to do
## with them (and with all-NA rows) is left to the caller.
##
## U:           n x K matrix or data frame of ratings; NA = no rating for that party.
## tie.breaker: optional n x K matrix of small increments added to U before taking the row
##              maximum. An NA entry carries no tie-breaking information and is treated as 0,
##              so a missing tie-breaker can no longer erase the rating it is added to.
## Returns an n x K integer matrix (no dimnames): 1 where the cell equals its row maximum,
## 0 where it does not, NA where the cell itself is NA (all-NA rows are therefore all NA).
plurality_sincere_vote_matrix <- function(U, tie.breaker = NULL){
  U = as.matrix(U)
  if(!is.null(tie.breaker)){
    tie.breaker = as.matrix(tie.breaker)
    stopifnot(nrow(tie.breaker) == nrow(U), ncol(tie.breaker) == ncol(U))
    tie.breaker[is.na(tie.breaker)] = 0
    U = U + tie.breaker
  }
  if(length(U) == 0){
    return(matrix(integer(0), nrow = nrow(U), ncol = ncol(U)))
  }
  # row maximum over the non-missing cells; NA (not -Inf plus a warning) when the row is empty
  row.max = function(i){
    if(all(is.na(U[i, ]))){NA_real_}else{as.numeric(max(U[i, ], na.rm = TRUE))}
  }
  max.U = vapply(seq_len(nrow(U)), row.max, numeric(1))
  # max.U has one entry per row, so recycling it down the columns lines up with the rows.
  # if more than one cell is the max, this is a tie that cannot be resolved.
  matrix(as.integer(U == max.U), nrow = nrow(U), ncol = ncol(U))
}

# Election results used to compute the pivotal probabilities.
all_results <- read.csv("all_results_05_10_15.csv")

# Regularized post-election vote.
D$vote.post.r <- regularize_party_vec(D$vote.post)

# The main loop cycles through these: types of preferences, information sources,
# precision levels (ss) and election years.
pref_types <- c("party_feelpre", "party_feelpost", "party_feelmix", "leader_feelpost")
info_types <- c("actual", "forecast")
ss <- c(85, 20, 12, 8)
years <- c(2005, 2010, 2015)

# Number of non-NA entries in a vector.
how.many.not.missing <- function(vec) {
  length(vec) - sum(is.na(vec))
}

# If TRUE, reuse the P_mat_list stored on disk, which speeds up the code below
# because the pivotal probabilities are not recomputed; if FALSE, start from an
# empty cache.
get_P_mat_from_disk <- FALSE

if (!get_P_mat_from_disk) {
  P_mat_list <- list()
} else {
  load("P_mat_list.RData")
}

# Flag respondents whose regularized post-election vote is missing or "don't know".
D$drop_because_vote_missing_or_dont_know = is.na(D$vote.post.r) | D$vote.post.r == "don't know"

### an indicator that this person was not asked party like-dislike in 2015:
### a 2015 respondent with no post-election party rating at all but at least one leader rating.
n_party_ratings = rowSums(!is.na(U_party_feelpost))
n_leader_ratings = rowSums(!is.na(U_leader_feelpost))
D$not_asked_party_like_dislike = FALSE
D$not_asked_party_like_dislike[D$year == 2015 & n_party_ratings == 0 & n_leader_ratings > 0] = TRUE


## indicator of concern for MP rather than for party or party leader 
# reason_is_best_candidate is only filled in for 2015; other years stay NA
D$reason_is_best_candidate = NA
D$reason_is_best_candidate[D$year == 2015] = grepl("best candidate", D$reasonForVote[D$year == 2015])
D$mentions_local_in_reason_for_vote_oth = grepl("[Ll]ocal", D$reasonForVoteOth)
# word boundaries rather than surrounding whitespace, so that "MP" is also found at the very
# start or end of a response and next to punctuation (e.g. "good local MP", "my MP.")
D$mentions_mp_in_reason_for_vote_oth = grepl("\\b[Mm][Pp]\\b", D$reasonForVoteOth)

## Main computation. For every preference type, information source and precision level s this
## fills three columns of D (names end in <pref_type>_<info_type>_<s>):
##   tau_*        best expected utility among the non-sincere votes minus the expected utility
##                of the sincere vote
##   boo_*        the best non-sincere vote
##   vote_code_*  1 = voted for favorite, 2 = voted for boo party, 3 = voted for another party
## plus tau_*_cat, the decile bin of tau. Per preference type it also records the favorite
## party (party1_*), the Lab-vs-Con ordering (left_*) and the reasons a respondent is dropped.
cat("Filling in tau and best insincere vote by year and constituency for:")
for(pref_type in pref_types){
  cat("\n -- ", pref_type, sep = "")
  U = get(paste0("U_", pref_type))
  # sincere-vote indicators; party ID, scaled by .01, is passed as the tie-breaker
  S = plurality_sincere_vote_matrix(U, party_id_mat*.01)
  colnames(S) = parties
  S[is.na(S)] = 0
  
  # rules for inclusion: 
  hmnm = apply(U, 1, how.many.not.missing) # at least 3 non-missing prefs
  hmfaves = apply(S, 1, sum, na.rm = TRUE) # 1 favorite party
  
  # some indicators so we can say why tau is missing 
  D[[paste0("drop_bc_too_few_prefs_", pref_type)]] = hmnm < 3
  D[[paste0("drop_bc_pref_tie_that_cant_be_resolved_by_partyID_", pref_type)]] = hmfaves > 1
  D[[paste0("drop_bc_no_favorite_", pref_type)]] = hmfaves == 0
  D[[paste0("how_many_not_missing_", pref_type)]] = hmnm
  
  use = (hmfaves == 1 & hmnm >= 3) # below we set tau to NA for Rs who don't satisfy these criteria: their tau would not be a reliable indicator of tactical vote incentive
  
  # define preferred party (called party1 for legacy reasons)
  D[[paste0("party1_", pref_type)]] = D[[paste0("left_", pref_type)]] = NA
  for(j in seq_along(parties)){
    D[[paste0("party1_", pref_type)]][S[,j] == 1] = parties[j]
  }
  # does this voter prefer Lab over Con, Con over Lab, or neither? (neither / missing stays NA)
  D[[paste0("left_", pref_type)]][U[,which(parties == "lab")] > U[,which(parties == "con")]] = TRUE
  D[[paste0("left_", pref_type)]][U[,which(parties == "lab")] < U[,which(parties == "con")]] = FALSE
  
  # set missing utilities to 0 -- this is like setting na.rm to T in the matrix multiplication.
  U[is.na(U)] = 0
  
  for(info_type in info_types){
    cat("\n    -- ", info_type, sep = "")
    for(s in ss){
      cat("\n      -- s=", s, sep = "")
      suffix = paste0(pref_type, "_", info_type, "_", s)
      tau_name = paste0("tau_", suffix)
      boo_name = paste0("boo_", suffix)
      vote_code_name = paste0("vote_code_", suffix)
      D[[tau_name]] = D[[boo_name]] = D[[vote_code_name]] = NA
      for(this_year in years){
        cat("\n        -- ", this_year, ": ", sep = "")
        all_refnos = sort(unique(D$refno[D$year == this_year]))
        counter = 0
        for(this_refno in all_refnos){
          if(this_year %in% c(2010, 2015) && this_refno == 108){next} # Buckingham -- Bercow seat. results data is nonsense. 
          counter = counter + 1
          if(counter%%100 == 0){cat(counter)}else if(counter%%10 == 0){cat(".")}
          these = !is.na(D$year) & !is.na(D$refno) & D$year == this_year & D$refno == this_refno
          this_U = as.matrix(U[these, ])
          # drop = FALSE keeps this a matrix even when the constituency has a single respondent
          this_S = S[these, , drop = FALSE]
          # get the P matrix -- either from storage or by computing fresh.
          # across preference types it is the same P matrix, so this speeds up the computations across preference types by a lot.
          P_mat_key = paste0(this_year, "_", this_refno, "_", info_type, "_", s)
          if(P_mat_key %in% names(P_mat_list)){
            this_P = P_mat_list[[P_mat_key]]
          }else{
            result_rows = which(all_results$type == info_type & all_results$year == this_year & all_results$refno == this_refno)
            if(length(result_rows) != 1){
              stop("Expected exactly one row of all_results for ", P_mat_key, " but found ", length(result_rows), call. = FALSE)
            }
            this_v_vec = as.numeric(all_results[result_rows, parties])
            # normalize the v_vec to sum to 1
            this_v_vec = this_v_vec/sum(this_v_vec, na.rm = TRUE)
            this_alpha_vec = this_v_vec*s
            # as in original code, set missing alpha components to 0, effectively assuming party with no result did not run and is therefore hopeless.
            this_alpha_vec[is.na(this_alpha_vec)] = 0
            names(this_alpha_vec) = parties 
            # compute P matrix
            this_P = plurality.P.matrix.analytical(this_alpha_vec)
            P_mat_list[[P_mat_key]] = this_P
          }
          UP = this_U%*%this_P ## UP: the n x K matrix of expected utilities
          # NOTE(review): masking by multiplication leaves a 0 in the masked columns, and that 0
          # takes part in the row maxima below. If every non-sincere option can have a negative
          # expected utility, the "best other" would be that 0 -- confirm against the definition
          # of the P matrix.
          other_UP = (1 - this_S)*UP
          # compute tau 
          this_tau = apply(other_UP, 1, max, na.rm = TRUE) - apply(this_S*UP, 1, max, na.rm = TRUE)
          D[[tau_name]][these] = this_tau
          ## identify boo party: best non-sincere vote. depends on tau.  
          max_other_UP = apply(other_UP, 1, max)
          this_boo = rep(NA, length(this_tau))
          # if several columns share the maximum, the party latest in `parties` is kept
          for(j in seq_along(parties)){
            this_boo[other_UP[,j] == max_other_UP] = parties[j]
          }
          D[[boo_name]][these] = this_boo
        }
        cat("done.")
      }
      # some computations we can do when we have filled out everyone 
      # set tau and boo to missing if we determined we should not be using it. 
      D[[tau_name]][!use] = NA
      D[[boo_name]][!use] = NA
      # make tau cats here too: decile bins 1..10, with the cut point closest to zero moved to
      # exactly 0 so that no bin straddles zero
      tau_cat_name = paste0(tau_name, "_cat")
      decile.cuts = quantile(D[[tau_name]], probs = seq(0, 1,.1), na.rm = TRUE)
      decile.cuts[abs(decile.cuts) == min(abs(decile.cuts))] = 0
      D[[tau_cat_name]] = NA
      n_bins = length(decile.cuts) - 1
      for(i in seq_len(n_bins)){
        in_bin = !is.na(D[[tau_name]]) & D[[tau_name]] >= decile.cuts[i]
        if(i < n_bins){
          in_bin = in_bin & D[[tau_name]] < decile.cuts[i+1]
        }else{
          # the last bin is closed on the right; otherwise the largest tau would get no category
          in_bin = in_bin & D[[tau_name]] <= decile.cuts[i+1]
        }
        D[[tau_cat_name]][in_bin] = i
      }
      # indicate vote code, which depends on tau.
      # 1: voted for favorite
      # 2: vote for best of others (tactical vote) 
      # 3: vote for other
      D[[vote_code_name]] = NA
      valid.vote = !is.na(D$vote.post.r) & D$vote.post.r != "don't know"
      pref.data.present = !is.na(D[[paste0("party1_", pref_type)]]) & !is.na(D[[boo_name]])
      D[[vote_code_name]][valid.vote & pref.data.present] = 3
      D[[vote_code_name]][valid.vote & pref.data.present & D$vote.post.r == D[[paste0("party1_", pref_type)]]] = 1
      D[[vote_code_name]][valid.vote & pref.data.present & D$vote.post.r == D[[boo_name]]] = 2
    }
  }
}

# see checks below 
# a few more variables we want to create 

# Ordinal income group: 1 = low, 2 = medium, 3 = high (NA if none of the dummies is set).
# The rows are selected with which(x == 1) rather than by indexing with the dummy itself:
# for a TRUE/FALSE column this selects exactly the same rows, and for a 0/1-coded column it
# avoids the values being used as row positions.
D$incomex = NA
D$incomex[which(D$incomehigh == 1)] = 3
D$incomex[which(D$incomemed == 1)] = 2
D$incomex[which(D$incomelow == 1)] = 1

# Ordinal education: 3 = educuni is 1, 2 = educuni is 0 but educl3plus is 1, 1 = both are 0.
D$educaltx = NA
D$educaltx[D$educuni == 1] = 3
D$educaltx[D$educuni == 0 & D$educl3plus == 1] = 2
D$educaltx[!is.na(D$educuni) & D$educuni == 0 & D$educl3plus == 0] = 1

# Ordinal age group: 1 = below30, 2 = 30to59, 3 = 60plus.
D$agegrpx = NA
D$agegrpx[as.character(D$agegrp) == "below30"] = 1
D$agegrpx[as.character(D$agegrp) == "30to59"] = 2
D$agegrpx[as.character(D$agegrp) == "60plus"] = 3


### a measure of whether correctly judged the winner (winner correct)
# we ask: when assigning numbers to the likelihood of a given party winning, did R assign highest number to the party that actually won? 
# win_campaign was on a 0-10 scale in 2005 and 2010, 0-100 scale in 2015 -- doesn't matter given how we measure.  
win_cols = paste0(parties, "_win_campaign")
# highest likelihood the respondent gave to any party; NA (rather than -Inf plus a warning)
# when the respondent gave no likelihood at all
max.or.na = function(vec){if(all(is.na(vec))){NA}else{max(vec, na.rm = TRUE)}}
D$max_likelihood_of_winning = apply(D[, win_cols], 1, max.or.na)
which.is.max = function(vec){which(vec == max(vec, na.rm = TRUE))[1]} # in event of tie, takes first.
all_results$local_winner = parties[apply(all_results[,parties], 1, which.is.max)] # recall that all_results has actual, forecast, and poll. 
D = merge(D, all_results[all_results$type == "actual", c("refno", "year", "local_winner")], by = c("year", "refno"), all.x = TRUE)
# merge() sorts its result on the `by` columns by default, so the year / refno / id order has
# to be restored here
D = D[order(D$year, D$refno, D$id), ]
# respondent x party indicator of the party that won the respondent's constituency
winner.mat = outer(D$local_winner, parties, "==")
colnames(winner.mat) = parties
D$likelihood.for.winner = apply(winner.mat*D[, win_cols], 1, sum, na.rm = TRUE)  # this gives 0 when it was NA for all. 
# NA for respondents who gave no likelihoods at all (their maximum is NA); they used to be
# coded FALSE because 0 was compared with -Inf
D$winner.correct = D$likelihood.for.winner == D$max_likelihood_of_winning


## Tactical voting from stated reasons. This first step builds the
## explanation-based coding (heath.tactical) that the Fisher measure further
## down starts from.

# Default is "not tactical"; NA where reasonForVote is 9999 or the
# post-election vote is missing.
D$heath.tactical <- FALSE
no_usable_answer <- D$reasonForVote == 9999 | is.na(D$vote.post)
D$heath.tactical[no_usable_answer] <- NA

# Closed-ended reason mentions "no chance" or "tactical".
gave_tactical_reason <- grepl("no chance", D$reasonForVote) | grepl("tactical", D$reasonForVote)
D$heath.tactical[gave_tactical_reason] <- TRUE

# now for the 2005 and 2010 waves, we exclude the ones who said they really
# preferred the party they actually voted for
party_voted_for <- regularize_party_vec(D$vote.post)
party_really_preferred <- regularize_party_vec(D$partyReallyPrefer)
D$heath.tactical[party_voted_for == party_really_preferred & !is.na(party_really_preferred)] <- FALSE

# Free-text "other" reasons: placeholder and empty answers become NA before
# the pattern matching below.
D$reasonForVoteOth[grepl("__NA__", D$reasonForVoteOth)] <- NA
D$reasonForVoteOth[which(D$reasonForVoteOth == "")] <- NA

# Regular expressions that mark a free-text reason as tactical.
tactical.patterns <- c(
  "[Tt]actical",
  "^[Ss]top ",
  "votes wasted",
  "any chance",
  "better chance of",
  "tactical vot",
  "^Best chance of",
  "^Best way to ensure that",
  "Did not want to waste my vote",
  "Didn't vote for UKIP as intended",
  "parents, tactical",
  "i was tactically voting",
  "i swapped vote",
  "voted tactically",
  "strategic vote"
)

# a list of patterns about keeping someone out or keeping someone from winning. The issue here is that you might prefer Labour over Conservative and then say you voted Labour to keep out the Conservatives. This is why Fisher's like-dislike filter makes sense.
# not clear whether to include logic like "Had to stop Labour"
keep.out.patterns <- c(
  "^[Tt]o keep.{5,10}out",
  "^\\s*[Tt]o keep out",
  "^\\s*[Tt]o keep.{5,10}out",
  "^\\s*^[Tt]o stop ",
  "Did not want labour to win",
  "Didn't want ukip getting in",
  "Didnt want Labour to win again in Da",
  "Fight the Tories!",
  "I did not want the Conservative canditate to be elected",
  "I did not want the Tiries to win",
  "I did not want the conservatives to get elected",
  "I didn't want Conservative to win",
  "to get the conservative s out",
  "to get the tories out",
  "to keep labour candidate out",
  "to stop conservatives winning the seat",
  "wanted to keep another party out",
  "wanted rid of",
  "^voted against",
  "to keep labour from winning",
  "to get rid of Labour",
  "have had a chance to beat",
  "no chance of getting in",
  "take the vote away",
  "To try to oust the Tories",
  "To eject local",
  "To Keep out the Tories",
  "best chance of getting rid of",
  "TO KEEP LABOUR OUT",
  "TO GET RID OF THE TORIES",
  "I wanted to minimise the influence of the SNP.",
  "I wanted the Torys out",
  "I wanted the Tories out",
  "I voted to keep SNP out of seat",
  "I want Gordon Brown GONE!",
  "to stop Lab",
  "I really do not want a Conservative government",
  "I dread a Tory government.",
  "I disliked the alternative more",
  "didn't want.{4,10}to get in",
  "Had enough of Labour"
)

# Any respondent whose free-text reason matches at least one pattern is coded tactical.
matches_any_pattern <- Reduce(
  `|`,
  lapply(c(tactical.patterns, keep.out.patterns), grepl, x = D$reasonForVoteOth)
)
D$heath.tactical[matches_any_pattern] <- TRUE


# Verbatim fragments of free-text answers that match a pattern above but are not tactical votes.
no.patterns = c("Stop asylum seekers", "I didn't want UKIP in more than Conservatives so voted Conservatives to stop UKIP getting in", "to stop immigration", "so didn't have to worry about voting tactically", "Tactical vote against Westminster", "I also HATED that so many voters said they voted tactically", "In past elections I have voted tactically. This I", "I was going to vote tactically (i.e. for the Liberals), but", "I normally vote tactically but voted honestly this time", "so sometimes I vote tactically. However they were likely to win here", "I thought long and hard about voting tactically this time as I live in a LibDem/Tory marginal. But couldn't do it", "Normally I vote tactically Lib Dem")

# These are literal quotes, so they are matched with fixed = TRUE. Read as regular expressions,
# the parentheses in "(i.e. for the Liberals)" form a group instead of matching the brackets in
# the answer, so that exclusion could never apply.
for(p in no.patterns){
  D$heath.tactical[grepl(p, D$reasonForVoteOth, fixed = TRUE)] = FALSE
}


# evidence of how futile this is 
# I didn't want the conservatives in for another 5 years -- point made above.
# "no credible alternative"
# "i voted for labour because my mom advised me to she put it to me in english term"
# "i vote for the same party as my husband"
# "i swapped vote"
# "lack of choice"
# "Liked the candidate's name"
# "It was a strategic vote based on the most likely outcomes."
# "I was spoken to by Labour and convinced to vote for them by the conversation"
# "I wanted to minimise the influence of the SNP."
# "I voted against Labour & Conservative rather than for Liberal democrats"
# I dread a Tory government.
# "I disliked the alternative more"
# "I would have voteed labour but Gordon Brown is a useless ejit"
# Did not want Conservatives - re Hunting Ban
## generally, voting against a leader -- is this tactical, or not? 
# "labour safe seat so atactical vote to get the lazy sod looking after his people"	
# Tactical vote against Westminster

# people's arguments are often things like, "I don't trust Gordon Brown" or "I hate the Conservatives". If you drilled down, their thinking might be consistent with instrumental voting, but they don't come right out and tell you. This is why it's dissatisfying. Or, they vote Labour because they don't like Cameron, but they don't say "it had the best leader". 

# On the other hand, many people who we have as tactical explicitly say it was because of the candidate.
# Also, someone could say they like party A, vote party B, and say it has the "best policies" -- liking a party may not be the same as saying you want it elected.  

# But Fisher is on to this, and this is why he adds these screens about who their favorite party is. And with that:  


### Fisher (2004)'s measure of tactical voting: the explanation-based Heath coding
### is the starting point; checks based on like-dislike scores are applied to it afterwards.

D[["fisher.tactical"]] <- D[["heath.tactical"]]

## consistency checks: 
# clear favorite on strength of feeling equal to vote, i.e. they say they are tactical but they voted for party1post. 
# TRUE when the highest non-missing score is strictly above the second highest,
# FALSE when they are equal, NA when fewer than two scores are non-missing.
clear.first.func <- function(vec) {
  top.two <- sort(vec, decreasing = TRUE)[1:2]
  top.two[1] > top.two[2]
}
# Per-party "feelmix" columns in D, built the same way as U_party_feelmix (the matrix that
# party1_party_feelmix is derived from): UKIP and Green scores come from the pre-election
# survey, all other parties from the post-election survey. The two sources used to be swapped
# here, so these columns disagreed with the favorite party they are compared with below.
for(party in parties){
  if(party %in% c("ukip", "grn")){
    D[[paste0(party, "feelmix")]] = D[[paste0(party, "feel")]]
  }else{
    D[[paste0(party, "feelmix")]] = D[[paste0(party, "feelpost")]]
  }
}
# as for the rating matrices, parties that won't be running where the respondent lives are
# blanked out: SNP outside Scotland, Plaid Cymru outside Wales
D$snpfeelmix[D$country != "Scotland"] = NA
D$pcfeelmix[D$country != "Wales"] = NA
feelmix_cols <- paste0(parties, "feelmix")
D$clear.first.feelmix <- apply(D[, feelmix_cols], 1, clear.first.func)

# Check 1: a clear favorite on like-dislike scores that is also the party voted for.
voted_for_favorite <- D$vote.post.r == D$party1_party_feelmix
D$fisher.tactical[D$clear.first.feelmix & voted_for_favorite] <- FALSE

# Check 2: when expressed "party really preferred", it is not the same as the clear favorite on strength of feeling
stated_preference <- regularize_party_vec(D$partyReallyPrefer)
D$fisher.tactical[D$clear.first.feelmix & !is.na(D$partyReallyPrefer) & (stated_preference != D$party1_party_feelmix)] <- FALSE

# Check 3: joint favorites on strength-of-feeling scores do not include "really preferred"
rows_without_clear_first <- which(!is.na(D$partyReallyPrefer) & !D$clear.first.feelmix)
for (i in rows_without_clear_first) {
  # the preferred party
  this.party <- regularize_party(D$partyReallyPrefer[i])
  # if they prefer some party other than our standards, we let it be a tactical vote
  if (this.party %in% parties) {
    # the like-dislike score assigned to that party
    score_for_preferred <- D[[paste0(this.party, "feelmix")]][i]
    # if we don't have a like-dislike score for this party, we let it be a tactical vote
    if (!is.na(score_for_preferred)) {
      # the maximum like-dislike score assigned to any party
      highest_score <- max(D[i, feelmix_cols], na.rm = TRUE)
      # if the "really preferred" party doesn't get at least joint-first like-dislike, Fisher says "not tactical"
      if (score_for_preferred < highest_score) {
        D$fisher.tactical[i] <- FALSE
      }
    }
  }
}

# Write the respondent-level data with all taus, and store the P matrices so a
# later run can load them instead of recomputing.
write.csv(D, file = "BES_all_including_all_taus_2005_2010_2015.csv", row.names = FALSE)
save(list = "P_mat_list", file = "P_mat_list.RData")


